In [22]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import time
import datetime
%matplotlib inline
Load data from processed dataset
In [23]:
# Load the processed match data and keep the raw frame untouched so the
# feature engineering below always starts from a pristine copy.
# NOTE(review): absolute, user-specific path - consider a configurable data directory.
matches_raw = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_Matches_Reduced.csv')
matches = matches_raw.copy()
# Parenthesised single-argument print behaves the same on Python 2 and 3
print(len(matches))
print(matches.shape)
In [24]:
# Preview the first rows of the freshly loaded match data
matches.head()
Out[24]:
First job is to create a target variable.
There are 3 possible outcomes: home win, draw, or away win.
In [25]:
# Create the target column as all-NaN; it is populated once the encoding
# (binary vs. multiclass) has been chosen.
matches['Result_Target'] = np.nan
In [26]:
# True  -> three-class target: 1 home win, 0 draw, -1 away win
# False -> binary target: 1 home win, 0 draw or away win
multiclass = False # Use this to set the target variables
In [27]:
# Encode the match outcome from the home side's point of view.
#   multiclass: 1 = home win, 0 = draw, -1 = away win
#   binary:     1 = home win, 0 = draw or away win
# `.loc` replaces `.ix`, which is deprecated and no longer exists in current pandas.
home_win = matches['home_team_goal'] > matches['away_team_goal']
away_win = matches['home_team_goal'] < matches['away_team_goal']
draw = matches['home_team_goal'] == matches['away_team_goal']

matches.loc[home_win, 'Result_Target'] = 1
matches.loc[draw, 'Result_Target'] = 0
matches.loc[away_win, 'Result_Target'] = -1 if multiclass else 0
In [108]:
# Class balance of the target: bar chart plus the raw counts
target_counts = matches['Result_Target'].value_counts()
target_counts.plot(kind = 'bar')
plt.xlabel('Result')
plt.ylabel('Frequency')
plt.title('Result_Target Distribution Frequency')
# Parenthesised single-argument print behaves the same on Python 2 and 3
print(target_counts)
In [29]:
# Compare Result_Target against home_team_goal / away_team_goal on the first rows
matches.head() # Sanity check that the results column makes sense - Looks Ok!
Out[29]:
The function below returns 3 dictionaries for each season:
- Team Standings
- Team Goal Difference
- Team Results String (win, loss, draw)
In [30]:
def SeasonStanding(season, n_stages=38, n_teams=20):
    """Replay one season stage by stage and snapshot the table after each stage.

    Parameters
    ----------
    season : pandas.DataFrame
        Matches of a single season. Must provide the columns ``stage``,
        ``home_team_api_id``, ``away_team_api_id``, ``home_team_goal`` and
        ``away_team_goal``.
    n_stages : int, optional
        Number of stages (game weeks) to replay, numbered from 1. Default 38.
    n_teams : int, optional
        League size; a standing is computed as ``n_teams + 1 - rank``. Default 20.

    Returns
    -------
    stage_dict : dict
        ``{stage: {team_id: standing}}`` built from cumulative points (3 for a
        win, 1 for a draw). Goal difference is not used as a tie-breaker: teams
        level on points share the lowest position of their tied group.
    goal_diff_stage : dict
        ``{stage: {team_id: cumulative goal difference}}``.
    team_results : dict
        ``{team_id: ['Win' | 'Loss' | 'Draw', ...]}`` in the order played.
    """
    points_dict = {}
    goal_diff = {}
    stage_dict = {}
    goal_diff_stage = {}
    team_results = {}

    for stage in range(1, n_stages + 1):
        sub = season[season['stage'] == stage]
        fixtures = zip(sub['home_team_api_id'], sub['away_team_api_id'],
                       sub['home_team_goal'], sub['away_team_goal'])

        for home_team, away_team, home_goal, away_goal in fixtures:
            if home_goal > away_goal:      # home win
                home_pts, away_pts = 3, 0
                home_res, away_res = 'Win', 'Loss'
            elif home_goal < away_goal:    # away win
                home_pts, away_pts = 0, 3
                home_res, away_res = 'Loss', 'Win'
            elif home_goal == away_goal:   # draw
                home_pts, away_pts = 1, 1
                home_res, away_res = 'Draw', 'Draw'
            else:
                # Scores that cannot be compared (e.g. NaN) leave the table unchanged
                continue

            margin = home_goal - away_goal
            updates = ((home_team, home_pts, home_res, margin),
                       (away_team, away_pts, away_res, -margin))
            for team, pts, res, diff in updates:
                points_dict[team] = points_dict.get(team, 0) + pts
                goal_diff[team] = goal_diff.get(team, 0) + diff
                team_results.setdefault(team, []).append(res)

        # Rank from the bottom of the table upwards; teams on equal points
        # receive the same rank.
        s_data = sorted(points_dict.items(), key=lambda item: item[1])
        rank, count, previous, result = 0, 0, None, {}
        for key, num in s_data:
            count += 1
            if num != previous:
                rank += count
                previous = num
                count = 0
            result[key] = (n_teams + 1) - rank

        # Snapshot copies so later stages do not overwrite earlier ones
        stage_dict[stage] = result.copy()
        goal_diff_stage[stage] = goal_diff.copy()

    return stage_dict, goal_diff_stage, team_results
In [31]:
def LastKResults(team, stage, results_list,k):
    """Count wins, losses and draws over the k results preceding `stage`.

    `results_list` holds one 'Win' / 'Loss' / 'Draw' entry per game played,
    in order, so the window is the slice ending just before index `stage - 1`.
    `team` is accepted for call-site symmetry but not used.

    Returns a (wins, losses, draws) tuple, or (-1, -1, -1) when fewer than
    k results precede the stage.
    """
    window_start = (stage - 1) - k
    if window_start >= 0:
        recent = results_list[window_start:stage - 1]
        return tuple(recent.count(outcome) for outcome in ('Win', 'Loss', 'Draw'))
    # Not enough history yet: sentinel values
    return -1, -1, -1
In [32]:
def CreateSeasonStats(df_in,last_k):
    """Append pre-match standing, goal-difference and recent-form features.

    Every lookup uses the table as it stood after the previous stage, so no
    information from the current game week leaks into its own features.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Matches with ``season``, ``stage``, ``home_team_api_id``,
        ``away_team_api_id``, ``home_team_goal`` and ``away_team_goal``.
        It is copied, not modified.
    last_k : int
        Form features are produced for k = 1 .. last_k - 1 (the window sizes
        come from ``range(1, last_k)``).

    Returns
    -------
    pandas.DataFrame
        Copy of `df_in` with ``Home/Away_Team_Last_<k>_Wins/Draws/Losses``,
        ``Home_Standing``, ``Away_Standing``, ``Home_Goal_Diff``,
        ``Away_Goal_Diff``, ``Standing_Diff`` and ``Diff_Goal_Diff`` added.
        Stage-1 matches get standing -1 and goal difference 0; form counts are
        -1 when fewer than k results are available.
    """
    df = df_in.copy()  # work on a copy to avoid SettingWithCopy issues on slices

    # Per-season lookup tables. Seasons come from the frame being processed,
    # not from the notebook-level `matches` variable.
    season_standings = {}
    season_goal_difference = {}
    season_results = {}
    for season in df['season'].unique():
        standings, goal_difference, results = SeasonStanding(df[df['season'] == season])
        season_standings[season] = standings
        season_goal_difference[season] = goal_difference
        season_results[season] = results

    home_team_standing = []
    away_team_standing = []
    home_team_goal_diff = []
    away_team_goal_diff = []

    k_values = list(range(1, last_k))
    outcomes = ('Wins', 'Draws', 'Losses')
    home_dict = dict((k_val, dict((outcome, []) for outcome in outcomes)) for k_val in k_values)
    away_dict = dict((k_val, dict((outcome, []) for outcome in outcomes)) for k_val in k_values)

    # Iterate positionally so a non-default index on the input cannot break lookups
    rows = zip(df['home_team_api_id'].tolist(), df['away_team_api_id'].tolist(),
               df['stage'].tolist(), df['season'].tolist())
    for home_team, away_team, stage, season in rows:
        if stage == 1:
            # No table exists before the first game week
            home_team_standing.append(-1)
            away_team_standing.append(-1)
            home_team_goal_diff.append(0)
            away_team_goal_diff.append(0)
        else:
            previous_table = season_standings[season][stage - 1]
            previous_goal_diff = season_goal_difference[season][stage - 1]
            home_team_standing.append(previous_table[home_team])
            away_team_standing.append(previous_table[away_team])
            home_team_goal_diff.append(previous_goal_diff[home_team])
            away_team_goal_diff.append(previous_goal_diff[away_team])

        # Last-k form for both sides
        home_team_results_list = season_results[season][home_team]
        away_team_results_list = season_results[season][away_team]
        for k_val in k_values:
            hwc, hlc, hdc = LastKResults(home_team, stage, home_team_results_list, k_val)
            awc, alc, adc = LastKResults(away_team, stage, away_team_results_list, k_val)
            home_dict[k_val]['Wins'].append(hwc)
            home_dict[k_val]['Draws'].append(hdc)
            home_dict[k_val]['Losses'].append(hlc)
            away_dict[k_val]['Wins'].append(awc)
            away_dict[k_val]['Draws'].append(adc)
            away_dict[k_val]['Losses'].append(alc)

    for val in k_values:
        df['Home_Team_Last_%d_Wins'%(val)] = home_dict[val]['Wins']
        df['Home_Team_Last_%d_Draws'%(val)] = home_dict[val]['Draws']
        df['Home_Team_Last_%d_Losses'%(val)] = home_dict[val]['Losses']
        df['Away_Team_Last_%d_Wins'%(val)] = away_dict[val]['Wins']
        df['Away_Team_Last_%d_Draws'%(val)] = away_dict[val]['Draws']
        df['Away_Team_Last_%d_Losses'%(val)] = away_dict[val]['Losses']

    df['Home_Standing'] = home_team_standing
    df['Away_Standing'] = away_team_standing
    df['Home_Goal_Diff'] = home_team_goal_diff
    df['Away_Goal_Diff'] = away_team_goal_diff
    df['Standing_Diff'] = df['Home_Standing'] - df['Away_Standing']
    df['Diff_Goal_Diff'] = df['Home_Goal_Diff'] - df['Away_Goal_Diff']
    return df
In [33]:
# Add standing, goal-difference and recent-form columns. With last_k=10 the
# form features cover k = 1..9, because CreateSeasonStats iterates range(1, last_k).
matches = CreateSeasonStats(matches,10)
matches.head(30)
Out[33]:
In [34]:
## Betting Odds Columns
# Take a copy of the odds columns so later edits to `matches` do not alter `odds`.
# NOTE(review): positional slice 9:27 presumably selects the 18 bookmaker
# columns - confirm against the CSV column order; selecting by name would be safer.
odds = matches[matches.columns[9:27]].copy()
odds.head()
Out[34]:
In [35]:
# Sort the odds columns into home / draw / away groups.
home_odds, away_odds, draw_odds = [], [], []

# Every 'WH*' column contains the letter 'H', so those three are routed by
# exact name before the substring rules below are applied.
wh_targets = {'WHH': home_odds, 'WHD': draw_odds, 'WHA': away_odds}

for colname in odds.columns:
    if colname in wh_targets:
        target = wh_targets[colname]
    elif 'H' in colname:
        target = home_odds
    elif 'A' in colname:
        target = away_odds
    else:
        target = draw_odds
    target.append(colname)
In [36]:
# Fill missing odds with the row-wise average of the same outcome across
# bookmakers: home odds from the home columns, draw odds from the draw columns,
# away odds from the away columns (previously every column was filled with
# the home-odds mean).
row_mean_by_outcome = {'H': odds[home_odds].mean(axis=1),
                       'D': odds[draw_odds].mean(axis=1),
                       'A': odds[away_odds].mean(axis=1)}
for bookmaker in ['BW', 'IW', 'LB']:
    for outcome in ['H', 'D', 'A']:
        odds_col = bookmaker + outcome
        # Assign back instead of fillna(inplace=True) on a column selection,
        # which is not guaranteed to write through to `matches`.
        matches[odds_col] = matches[odds_col].fillna(row_mean_by_outcome[outcome])
In [39]:
# Row-wise mean odds per outcome across all bookmakers, rounded to 2 decimals.
# The mean is taken over the `odds` copy and skips missing values.
for outcome_label, outcome_cols in (('Home', home_odds), ('Away', away_odds), ('Draw', draw_odds)):
    matches['Average_%s_Odds' % outcome_label] = odds[outcome_cols].mean(axis=1).round(2)
In [40]:
## Remove extra betting columns
# The per-bookmaker odds are superseded by the Average_*_Odds columns.
bookmaker_cols = ['BWH', 'IWH', 'LBH',
                  'BWD', 'IWD', 'LBD',
                  'BWA', 'IWA', 'LBA',
                  'B365H', 'B365A', 'B365D',
                  'WHH', 'WHD', 'WHA',
                  'VCH', 'VCD', 'VCA']
for bookmaker_col in bookmaker_cols:
    del matches[bookmaker_col]
In [41]:
# List the remaining columns after the per-bookmaker odds were removed
matches.columns
Out[41]:
In [42]:
# Preview the frame with the averaged odds columns in place
matches.head()
Out[42]:
All columns look OK! Finally, let's check that there are no NaN values.
In [43]:
# Map each column that contains missing values to its null count
# (an empty dict means the frame has no nulls).
null_counts = ((column, matches[column].isnull().sum()) for column in matches.columns)
null_dict = dict((column, missing) for column, missing in null_counts if missing > 0)
null_dict
Out[43]:
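This step creates 4 new features: goals scored and goals conceded over the last k matches, for both the home and the away team.
For games where a last-k value cannot be calculated (the first k stages of a season), the attribute is imputed with -1.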
In [44]:
def LastKGoals(season_df,k, n_stages=38):
    """Goals scored and conceded by each side over the k stages before a match.

    Parameters
    ----------
    season_df : pandas.DataFrame
        Matches of one season with ``stage``, ``match_api_id``,
        ``home_team_api_id``, ``away_team_api_id``, ``home_team_goal`` and
        ``away_team_goal``.
    k : int
        Number of preceding stages in the window.
    n_stages : int, optional
        Number of stages in the season, numbered from 1. Default 38.

    Returns
    -------
    pandas.DataFrame
        One row per match with ``match_api_id``, ``home_team_last_k_scored``,
        ``home_team_last_k_conceded``, ``away_team_last_k_scored`` and
        ``away_team_last_k_conceded``. Matches in stages 1..k get -1 in all
        four feature columns because the window is incomplete.
    """
    def _goals_in_window(window, team):
        """Return (scored, conceded) for `team` over the matches in `window`."""
        as_home = window[window['home_team_api_id'] == team]
        as_away = window[window['away_team_api_id'] == team]
        scored = sum(as_home['home_team_goal'].tolist()) + sum(as_away['away_team_goal'].tolist())
        conceded = sum(as_home['away_team_goal'].tolist()) + sum(as_away['home_team_goal'].tolist())
        return scored, conceded

    home_team_last_k_scored = []
    home_team_last_k_conceded = []
    away_team_last_k_scored = []
    away_team_last_k_conceded = []
    match_api_id = []

    for i in range(1, n_stages + 1):
        cur_stage = season_df[season_df['stage'] == i]
        match_api_id.extend(cur_stage['match_api_id'].tolist())

        if i <= k:
            # Window incomplete: one -1 per match actually present in this
            # stage (not a fixed 10), so all columns stay the same length.
            placeholder = [-1] * len(cur_stage)
            home_team_last_k_scored.extend(placeholder)
            home_team_last_k_conceded.extend(placeholder)
            away_team_last_k_scored.extend(placeholder)
            away_team_last_k_conceded.extend(placeholder)
            continue

        # Matches played in the k stages before stage i
        in_window = (season_df['stage'] >= i - k) & (season_df['stage'] < i)
        window = season_df[in_window]

        fixtures = zip(cur_stage['home_team_api_id'].tolist(),
                       cur_stage['away_team_api_id'].tolist())
        for home_team, away_team in fixtures:
            scored, conceded = _goals_in_window(window, home_team)
            home_team_last_k_scored.append(scored)
            home_team_last_k_conceded.append(conceded)

            scored, conceded = _goals_in_window(window, away_team)
            away_team_last_k_scored.append(scored)
            away_team_last_k_conceded.append(conceded)

    last_k_df = pd.DataFrame({'home_team_last_k_scored': home_team_last_k_scored,
                              'home_team_last_k_conceded': home_team_last_k_conceded,
                              'away_team_last_k_scored': away_team_last_k_scored,
                              'away_team_last_k_conceded': away_team_last_k_conceded,
                              'match_api_id': match_api_id})
    return last_k_df
In [45]:
def CreateLastKFeature(df,lastk):
    """Build the last-k goals features for every season in `df`.

    Runs LastKGoals on each season of `df` separately (the window never
    crosses a season boundary) and stacks the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Matches with a ``season`` column plus the columns LastKGoals needs.
    lastk : int
        Window size in stages; it is also embedded in the output column names.

    Returns
    -------
    pandas.DataFrame
        ``match_api_id`` plus ``home_last_<k>_scored``, ``home_last_<k>_conceded``,
        ``away_last_<k>_scored`` and ``away_last_<k>_concede``. The last name is
        spelled without the trailing 'd' on purpose: later cells read the
        column under exactly that name.
    """
    home_k_scored = []
    home_k_conceded = []
    away_k_scored = []
    away_k_conceded = []
    match_id = []

    for year in df['season'].unique():
        # Filter the frame that was passed in, not the notebook-level `matches`
        last_k_goals_df = LastKGoals(df[df['season'] == year], lastk)
        home_k_scored.extend(last_k_goals_df['home_team_last_k_scored'].tolist())
        home_k_conceded.extend(last_k_goals_df['home_team_last_k_conceded'].tolist())
        away_k_scored.extend(last_k_goals_df['away_team_last_k_scored'].tolist())
        away_k_conceded.extend(last_k_goals_df['away_team_last_k_conceded'].tolist())
        match_id.extend(last_k_goals_df['match_api_id'].tolist())

    last_k_df = pd.DataFrame({'home_last_%d_scored'%(lastk): home_k_scored,
                              'home_last_%d_conceded'%(lastk): home_k_conceded,
                              'away_last_%d_scored'%(lastk): away_k_scored,
                              'away_last_%d_concede'%(lastk): away_k_conceded,
                              'match_api_id': match_id})
    return last_k_df
In [46]:
## Create features for k = 2,3,4,5
# Each pass merges one set of last-k goal columns onto `matches` by match id.
start = time.time()
for k_value in range(2,6):
    matches = matches.merge(CreateLastKFeature(matches,k_value), how='inner', on = 'match_api_id')
end = time.time()
diff = end-start
# Parenthesised single-argument print behaves the same on Python 2 and 3
print('Adding new k features took ' + str(round(diff,2)) + ' seconds')
print(matches.shape)
In [47]:
# Inspect the last rows, where the last-k windows are fully populated
matches.tail()
Out[47]:
Let's check the results and make sure there aren't any missing values.
In [48]:
# Re-run the null audit after the last-k merge: column -> null count for
# every column that has at least one missing value.
null_counts = ((column, matches[column].isnull().sum()) for column in matches.columns)
null_dict = dict((column, missing) for column, missing in null_counts if missing > 0)
null_dict
Out[48]:
Great! There aren't any nulls
In [50]:
#Let's do some investigation on the new features
# NOTE(review): the positional slice [31:] presumably starts at the first
# engineered column - confirm against the current column order.
cols = matches.columns[31:]
for col in cols:
    matches[col].plot(kind='hist')
    plt.title(col)
    plt.show()
    # Parenthesised single-argument print behaves the same on Python 2 and 3
    print(matches[col].value_counts())
This concludes the 'last-k-goals' features!
The goal of this feature is to append a categorical feature 'Day of the Week' to each match. Because some matches happen during the middle of the week, maybe that could provide some information to the model.
In [54]:
# Keep only the part of each 'date' string before the first space
dates_raw = matches['date'].tolist()
dates_split = [raw_date.split(' ')[0] for raw_date in dates_raw]
dates_split[0]
Out[54]:
In [55]:
# Weekday index of every match (tm_wday: Monday = 0 ... Sunday = 6).
# Built as a list: on Python 3 `map` returns an iterator, which cannot be
# assigned to a DataFrame column as a sequence of values.
day_of_week = [time.strptime(day, '%m/%d/%y').tm_wday for day in dates_split]
matches['day_of_week'] = day_of_week
In [56]:
# Histogram of weekday indices; tm_wday counts Monday as 0, so 5 is Saturday
matches['day_of_week'].plot(kind='hist') #Most matches played on saturday (day of week =5), but matches are also played on other days
Out[56]:
In [57]:
# Exact match counts per weekday index (0 = Monday ... 6 = Sunday)
matches['day_of_week'].value_counts() #Most matches played on saturday (day of week =5), but matches are also played on other days
Out[57]:
In [58]:
# Let's re-investigate our matches dataframe
# (the new 'day_of_week' column should now be present)
matches.tail()
Out[58]:
In [59]:
# One-hot encode the weekday index into seven named binary columns.
weekday_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

# get_dummies only emits columns for weekdays that occur in the data, so
# reindex to all seven indices (0 = Monday ... 6 = Sunday) before naming them;
# a weekday with no matches becomes an all-zero column instead of an error.
day_of_week_dummies = pd.get_dummies(matches['day_of_week']).reindex(columns=list(range(7)), fill_value=0)
day_of_week_dummies.columns = weekday_names

#Add new binary columns to matches dataframe
for weekday in weekday_names:
    matches[weekday] = day_of_week_dummies[weekday]

del matches['day_of_week']
In [60]:
# Confirm the seven weekday indicator columns replaced 'day_of_week'
matches.head()
Out[60]:
In [61]:
# Load the team lookup table.
# NOTE(review): absolute, user-specific path - consider a configurable data directory.
teams = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_teams.csv')
# Parenthesised single-argument print behaves the same on Python 2 and 3
print(teams.shape)
teams.head()
Out[61]:
In [62]:
#Let's reduce the dataframe only to the columns we need
# Only the id (join key) and the readable team name are kept.
keepcols = ['team_api_id','team_long_name']
unwanted_cols = [col for col in teams.columns if col not in keepcols]
teams = teams.drop(unwanted_cols, axis=1)
teams.head()
Out[62]:
In [63]:
# Attach team names twice: first for the home side, then for the away side.
# The second merge collides with the columns added by the first, so pandas
# suffixes them with _x (home) and _y (away).
matches = (matches
           .merge(teams, how = 'left', left_on = 'home_team_api_id', right_on = 'team_api_id')
           .merge(teams, how = 'left', left_on = 'away_team_api_id', right_on = 'team_api_id'))
matches.head()
Out[63]:
In [64]:
# Drop the duplicated join keys left behind by the two team merges and give
# the name columns readable labels.
remove_cols = ['team_api_id_x','team_api_id_y']
leftover_keys = [col for col in matches.columns if col in remove_cols]
matches = matches.drop(leftover_keys, axis=1)
matches = matches.rename(columns={'team_long_name_x': 'Home_Team', 'team_long_name_y': 'Away_Team'})
In [65]:
# Matches per season - a quick check that the left merges did not add or drop rows
matches.season.value_counts()
Out[65]:
In [66]:
# Parenthesised single-argument print behaves the same on Python 2 and 3
print(matches.head())
print(matches.shape) # We now have 56 total columns (some of them we will get rid of..)
In [67]:
# Load the per-team attribute records.
# NOTE(review): absolute, user-specific path - consider a configurable data directory.
team_attributes = pd.read_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/EPL_team_attributes.csv')
team_attributes.head()
Out[67]:
In [68]:
# Let's see what columns make up this dataframe
# (used to decide which attribute columns to drop next)
team_attributes.columns
Out[68]:
In [69]:
# Remove columns that we won't use
remove_cols = ['buildUpPlaySpeedClass','buildUpPlayDribbling','buildUpPlayPassingClass','chanceCreationPassingClass',
               'chanceCreationCrossingClass','chanceCreationShootingClass','defencePressureClass','defenceAggressionClass',
               'defenceTeamWidthClass','ID']
# Names in remove_cols that are not present in the frame are simply ignored
unused_cols = [col for col in team_attributes.columns if col in remove_cols]
team_attributes = team_attributes.drop(unused_cols, axis=1)
team_attributes.shape ## Now we have just 16 columns
Out[69]:
In [70]:
# Split dates to get just the date component
# (everything before the first space of each 'date' string)
dates_raw = team_attributes['date'].tolist()
dates_split = [raw_date.split(' ')[0] for raw_date in dates_raw]
In [71]:
# Extract the year and month from each record and append to team_attributes dataframe
# Parse each date once and build lists: on Python 3 `map` returns an
# iterator, which cannot be assigned to a DataFrame column as values.
parsed_dates = [time.strptime(day, '%m/%d/%y') for day in dates_split]
team_attributes['month'] = [parsed.tm_mon for parsed in parsed_dates]
team_attributes['year'] = [parsed.tm_year for parsed in parsed_dates]
In [72]:
def DetermineSeason(df):
    """Map a record's 'year' and 'month' to a season label such as '2010/2011'.

    `df` is a single row (anything indexable by 'year' and 'month'). Records
    dated 2008 always map to '2008/2009'. For 2009-2016, months before August
    belong to the season that started the previous year; August onwards
    belongs to the season starting that year. Any other year yields None.
    """
    year = df['year']
    if year == 2008:
        return '2008/2009'
    if year in range(2009, 2017):
        # A season starts in August and is labelled '<start year>/<start year + 1>'
        start_year = year - 1 if df['month'] < 8 else year
        return '%d/%d' % (start_year, start_year + 1)
    return None
In [73]:
# Label every attribute record with its season (row-wise, from 'year' and 'month')
team_attributes['season'] = team_attributes.apply(DetermineSeason, axis = 1)
In [74]:
# Spot-check the derived month / year / season columns
team_attributes.head(20)
Out[74]:
In [75]:
# Seasons covered by the attribute records; matches in any other season are
# lost in the inner merges that follow.
team_attributes['season'].unique() # By using team attributes, it eliminates data from 2008/2009 and 2012/2013 season! shoot..
Out[75]:
In [76]:
# Attribute records per season.
# NOTE(review): more than one record per team and season would duplicate
# match rows in the merges below - verify.
team_attributes['season'].value_counts()
Out[76]:
In [77]:
## Let's test merging the team features with the matches dataframe
# Inner join on (home team id, season): matches without a matching attribute
# record for the home side are dropped.
matches = matches.merge(team_attributes, how = 'inner', left_on = ['home_team_api_id','season'], right_on = \
['team_api_id','season'])
matches.head()
Out[77]:
In [78]:
# Same inner join for the away side. Attribute columns already present from
# the home merge get the _x suffix; the away copies get _y.
matches = matches.merge(team_attributes, how = 'inner', left_on = ['away_team_api_id','season'], right_on = \
['team_api_id','season'])
In [79]:
# Row count after both inner merges
matches.shape # We have reduced the rows from 3040 to 2280...lost 25% of the data!.. We lose 2008/2009 and 2012/2013
Out[79]:
In [80]:
# Remaining matches per season after the attribute merges
matches.season.value_counts()
Out[80]:
In [81]:
def Stage2Cat(df):
    """Label a match row by whether its 'stage' falls before stage 22.

    Returns 'Before Transfer Window' for stages below 22 and
    'After Transfer Window' otherwise.
    """
    return 'Before Transfer Window' if df['stage'] < 22 else 'After Transfer Window'
In [82]:
# Categorise every match as before / after the stage-22 cut-off
matches['Stage_Cat'] = matches.apply(Stage2Cat, axis = 1)
In [83]:
# One-hot encode Stage_Cat and append the indicator columns to `matches`.
stage_cat_dummies = pd.get_dummies(matches['Stage_Cat'])
# The source column is kept here; it is removed later with the date columns.
#del matches['Stage_Cat']
matches = pd.concat([matches,stage_cat_dummies], axis = 1)
The goal of this feature is to have a flag that indicates if a team was newly promoted that season
In [84]:
# Season label -> the three teams listed as promoted for that season.
# Names must match the 'Home_Team' / 'Away_Team' values exactly.
# There is no entry for '2008/2009'.
promoted_teams = {'2009/2010': ['Wolverhampton Wanderers','Birmingham City','Burnley'],\
'2010/2011': ['Newcastle United','West Bromwich Albion','Blackpool'],\
'2011/2012': ['Queens Park Rangers','Norwich City','Swansea City'],\
'2012/2013': ['Reading','Southampton','West Ham United'],\
'2013/2014': ['Cardiff City','Hull City','Crystal Palace'],\
'2014/2015': ['Leicester City','Burnley','Queens Park Rangers'],\
'2015/2016': ['AFC Bournemouth','Watford','Norwich City']}
In [85]:
def _PromotedFlag(row, team_col):
    """Return 1 if the team in `team_col` is listed as promoted for the row's own season."""
    # .get(): a season with no entry in promoted_teams has no promoted teams
    return int(row[team_col] in promoted_teams.get(row['season'], []))

def HomePromotedTeam(df):
    """Flag (1/0) whether the home team was newly promoted in this match's season.

    `df` is one match row providing 'Home_Team' and 'season'. The lookup uses
    only the match's own season, so a team promoted in a different season is
    not flagged.
    """
    return _PromotedFlag(df, 'Home_Team')

def AwayPromotedTeam(df):
    """Flag (1/0) whether the away team was newly promoted in this match's season.

    `df` is one match row providing 'Away_Team' and 'season'. The lookup uses
    only the match's own season, so a team promoted in a different season is
    not flagged.
    """
    return _PromotedFlag(df, 'Away_Team')
In [86]:
# Row-wise promoted-team indicators for both sides.
# NOTE(review): 'Away_Promoted_Flat' looks like a typo for '..._Flag', but
# later cells read the column under this exact name - rename both together.
matches['Home_Promoted_Flag'] = matches.apply(HomePromotedTeam, axis=1)
matches['Away_Promoted_Flat'] = matches.apply(AwayPromotedTeam, axis = 1)
In [87]:
# Shape after adding the two promoted-team flags
matches.shape
Out[87]:
In [91]:
# Clubs treated as "major" teams; names must match 'Home_Team' / 'Away_Team' exactly
major_teams = ['Manchester United','Arsenal','Chelsea','Manchester City','Liverpool']
In [97]:
def HomeMajorTeams(df):
    """Return 1 if the row's 'Home_Team' is listed in major_teams, else 0."""
    return 1 if df['Home_Team'] in major_teams else 0

def AwayMajorTeams(df):
    """Return 1 if the row's 'Away_Team' is listed in major_teams, else 0."""
    return 1 if df['Away_Team'] in major_teams else 0
In [98]:
# Row-wise major-team indicators for both sides
matches['Home_Major_Team'] = matches.apply(HomeMajorTeams,axis=1)
matches['Away_Major_Team'] = matches.apply(AwayMajorTeams,axis=1)
In [100]:
# List the columns that still hold non-numeric (object) data and therefore
# need encoding or removal before modeling.
for col in matches.columns:
    if matches[col].dtype == 'object':
        # Parenthesised single-argument print behaves the same on Python 2 and 3
        print(col)
In [101]:
def CreateDummies(df,col):
    """Replace the categorical column `col` with one-hot indicator columns.

    The source column is deleted from `df` in place, and a new frame with the
    indicator columns appended is returned. Indicator names are the category
    value plus a suffix: '_home_team' for 'Home_Team', '_away_team' for
    'Away_Team', and '_<col>' for any other column.
    """
    indicator_cols = pd.get_dummies(df[col])
    del df[col]

    special_suffixes = {'Home_Team': '_home_team', 'Away_Team': '_away_team'}
    suffix = special_suffixes.get(col, '_' + col)
    indicator_cols.columns = indicator_cols.columns + suffix

    return pd.concat([df, indicator_cols], axis=1)
In [102]:
# Categorical columns to one-hot encode. The _x / _y suffixes come from the
# home-side and away-side team_attributes merges respectively.
dummy_cols = ['season','Home_Team','Away_Team','buildUpPlayDribblingClass_x',\
'buildUpPlayPositioningClass_x', 'chanceCreationPositioningClass_x',\
'defenceDefenderLineClass_x','buildUpPlayDribblingClass_y','buildUpPlayPositioningClass_y',\
'chanceCreationPositioningClass_y','defenceDefenderLineClass_y']
In [103]:
# One-hot encode each categorical column in turn; CreateDummies removes the
# source column and returns the widened frame.
for dummy_col in dummy_cols:
    matches = CreateDummies(matches, dummy_col)
matches.shape
Out[103]:
In [104]:
# Re-check which object-typed columns remain after the one-hot encoding
for col in matches.columns:
    if matches[col].dtype == 'object':
        # Parenthesised single-argument print behaves the same on Python 2 and 3
        print(col)
In [105]:
# Let's remove the date columns that we won't be using anymore
# Every column whose name contains 'date' goes, along with the Stage_Cat
# source column (already one-hot encoded earlier).
date_cols = [col for col in matches.columns if 'date' in col]
for obsolete_col in date_cols + ['Stage_Cat']:
    del matches[obsolete_col]
matches.shape
Out[105]:
Last step - Remove all features that we don't want to use
In [106]:
# Identifier and helper columns that must not reach the model
remove_cols = ['id_x','match_api_id','home_team_api_id','away_team_api_id',
               'month_x','year_x','id','team_fifa_api_id_y','team_api_id_y',
               'id_y','team_fifa_api_id_x','team_api_id_x','month_y','year_y']
# Drop only the names that are actually present. This keeps the original
# "skip missing columns" intent without a blanket `except Exception` that
# would also hide unrelated errors.
present_cols = [col for col in remove_cols if col in matches.columns]
matches = matches.drop(present_cols, axis=1)
matches.shape
Out[106]:
In [107]:
# Finally export matches dataframe for modeling
# index=False is the documented way to leave the row index out of the file.
# NOTE(review): absolute, user-specific path - consider a configurable data directory.
matches.to_csv('/Users/mtetkosk/Google Drive/Data Science Projects/data/processed/20170217_Matches_w_Features.csv',index=False)
In [72]:
# NOTE(review): this import belongs in the import cell at the top of the
# notebook; it is repeated again further down.
from scipy.stats import pearsonr
# Pearson correlation between the home side's goals in its last 4 stages and the target
pearsonr(matches['home_last_4_scored'],matches['Result_Target'])
Out[72]:
In [77]:
# Pearson correlation between the goal-difference gap (home minus away) and the target
pearsonr(matches['Diff_Goal_Diff'],matches['Result_Target'])
Out[77]:
In [110]:
from scipy.stats import pearsonr
In [111]:
# For each last-k goals feature family, plot how its correlation with the
# target changes as the window k grows from 2 to 5.
metric_list = ['home_last_%d_scored','home_last_%d_conceded','away_last_%d_scored','away_last_%d_concede']
plot_titles = ['Home Last K Scored', 'Home Last K Conceded', 'Away Last K Scored', 'Away Last K Conceded']
k_values = range(2,6)

for metric, plot_title in zip(metric_list, plot_titles):
    # pearsonr returns a pair; element 0 is the correlation coefficient
    corr_list = [pearsonr(matches[metric % k], matches['Result_Target'])[0] for k in k_values]
    plt.plot(k_values, corr_list)
    plt.xlabel('Number of Last K Matches')
    plt.ylabel('Correlation with Result')
    plt.title(plot_title)
    plt.show()
In [121]:
# Correlation of each standalone feature with the target, in metric_list order.
# 'Away_Promoted_Flat' is the column's actual (misspelled) name in `matches`.
metric_list = ['Home_Promoted_Flag', 'Away_Promoted_Flat','Standing_Diff','Diff_Goal_Diff']
# pearsonr returns a pair; element 0 is the correlation coefficient
corr_list = [pearsonr(matches[metric], matches['Result_Target'])[0] for metric in metric_list]
In [122]:
# Report the correlations computed above, rounded to 2 decimals.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('Home Promoted Flag Correlation with Target= ' + str(round(corr_list[0],2)))
print('Away Promoted Flag Correlation with Target= ' + str(round(corr_list[1],2)))
print('League Standing Difference Correlation with Target= ' + str(round(corr_list[2],2)))
print('Difference in Goal Diff Correlation with Target= ' + str(round(corr_list[3],2)))
In [118]:
# For each recent-form feature family, plot how its correlation with the
# target changes as the window k grows from 1 to 9.
metric_list = ['Home_Team_Last_%d_Wins','Home_Team_Last_%d_Draws','Home_Team_Last_%d_Losses','Away_Team_Last_%d_Wins','Away_Team_Last_%d_Draws','Away_Team_Last_%d_Losses']
plot_titles = ['Home Last K Wins', 'Home Last K Draws', 'Home Last K Losses',
               'Away Last K Wins', 'Away Last K Draws', 'Away Last K Losses']
k_values = range(1,10)

for metric, plot_title in zip(metric_list, plot_titles):
    # pearsonr returns a pair; element 0 is the correlation coefficient
    corr_list = [pearsonr(matches[metric % k], matches['Result_Target'])[0] for k in k_values]
    plt.plot(k_values, corr_list)
    plt.xlabel('Number of Last K Matches')
    plt.ylabel('Correlation with Result')
    plt.title(plot_title)
    plt.show()
In [ ]: